// Copyright 2024 The Google Research Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Define an aptamer selection experiment, consisting of multiple rounds of
// selection and high throughput sequencing.

syntax = "proto3";

package xxx.aptamers;

// A selection experiment consists of one or more rounds. The dependencies
// between rounds are indicated by Round.input.
// NextId: 14
message Experiment {
  // Field numbers 4 and 7 are not assigned to any field in this message even
  // though the NextId counter for the message is already past them.
  // Presumably they belonged to fields that have since been removed (TODO
  // confirm). Reserving them prevents a future edit from reusing a number that
  // previously serialized data may still carry.
  reserved 4, 7;

  // All rounds of this experiment. Keys give the name of rounds; Round.input
  // refers to these names.
  map<string, Round> rounds = 1;

  // Absolute path to directory in which reads from this experiment are saved.
  string base_directory = 2;

  // Fixed length of aptamer sequences in this experiment.
  // This information should be redundant with the actual reads. But it's nice
  // not to actually have to look at sequencing data to set up modeling code.
  int64 sequence_length = 3;

  // Fixed length that was sequenced for the reverse read in this experiment.
  // Ideally it would be the same as the forward read so that every base in the
  // reverse read could be compared with the forward read during quality checks.
  // In some cases though, the sequencer was set to read a different length
  // in the reverse read than the forward read.
  // This information should be redundant with the actual reads. But it's nice
  // not to actually have to look at sequencing data to set up modeling code.
  int64 reverse_sequence_length = 13;

  // (Optional) template for generating random sequences from this experiment,
  // in the form of a string of bases 'A', 'T', 'G', 'C' or 'N' of length
  // `sequence_length`, where 'N' represents an unknown base that is equally
  // likely to be any other base. By default, code should assume a template of
  // the form `'N' * sequence_length`.
  string template_sequence = 8;

  // True if the data specifies the secondary structure partition function
  // value for each aptamer.
  bool has_partition_function = 9;

  // The string sequence of the forward primer region. This region is constant
  // across all aptamers in the experiment.
  // To calculate the full sequence of an aptamer during selection, prepend this
  // forward primer to the aptamer_sequence.
  string forward_primer = 5;

  // The string sequence of the reverse primer region. This region is constant
  // across all aptamers in the experiment.
  // To calculate the full sequence of an aptamer during selection, append this
  // reverse primer after the aptamer_sequence.
  // Note that this sequence is on the same strand as the variable region
  // (i.e. the sequence is appended directly). For the actual primer sequence
  // used during PCR, you would take the reverse complement of this sequence.
  string reverse_primer = 6;

  // Additional (non-read) outputs recorded for this experiment, one entry per
  // output column.
  repeated AdditionalOutput additional_output = 10;

  // The dimension of the sequence embedding.
  int64 embedding_dim = 11;

  // The name prefix of the features that correspond to the sequence embedding.
  string embedding_columns_prefix = 12;
}

// We define a selection round very loosely, as anything that produces a new
// pool of aptamers followed by optional high throughput sequencing. Thus
// creating a random initial library and replicating a pool using PCR count as
// rounds, even though they aren't doing any active selection.
message Round {
  // If defined, should refer to the name of another Round in this Experiment
  // (i.e. a key of Experiment.rounds). That round is the one this round
  // depends on.
  string input = 1;

  // Sequencing reads of aptamers selected by this round.
  SequencingReads positive_reads = 2;

  // Only used for methods like particle display that preserve rejected
  // sequences.
  SequencingReads negative_reads = 3;

  // Method for selection in this round.
  enum Method {
    // Zero value; this is what an unset `method` field reads as in proto3.
    INVALID = 0;
    // Presumably the creation of a random initial library (no active
    // selection) -- TODO confirm.
    RANDOM = 1;
    // Presumably the replication of a pool using PCR (no active selection)
    // -- TODO confirm.
    PCR = 2;
    // Particle display; a selection method that preserves rejected sequences
    // (see negative_reads).
    PARTICLE_DISPLAY = 3;
    // SELEX; a method that involves selection.
    SELEX = 4;
  }
  // The method that produced this round's pool of aptamers.
  Method method = 4;

  // If a method involves selection of some sort (i.e., PARTICLE_DISPLAY and
  // SELEX), it must specify concentrations for all target and relevant
  // background molecules.
  //
  // Concentrations may use arbitrary units, but should be consistent within a
  // given Experiment. If the exact concentration is unknown or irrelevant but
  // the presence of a molecule is important information for machine learning
  // models, assign that concentration the value 1. A concentration of 0 should
  // be treated equivalently to missing keys.
  //
  // Keys in this field indicate the name of molecules. For Experiments with
  // only a single target, generic names such as "target" and "serum" may be
  // appropriate.
  map<string, float> target_concentrations = 5;
  // Concentrations of the relevant background molecules, keyed by molecule
  // name. Follows the same unit and zero-value conventions as
  // target_concentrations.
  map<string, float> background_concentrations = 6;
}

// Description of high-throughput DNA sequencing reads. The reads themselves
// may be very large, so they're stored in separate files (for now, FASTQ).
message SequencingReads {
  // Paths to the FASTQ files holding the forward reads.
  // Forward and reverse reads should be paired.
  // These paths are relative to base_directory on the Experiment proto.
  repeated string fastq_forward_path = 2;
  // Paths to the FASTQ files holding the reverse reads, relative to
  // base_directory on the Experiment proto and paired with
  // fastq_forward_path.
  repeated string fastq_reverse_path = 3;

  // Name of the column in the output that stores counts from this read.
  string name = 4;

  // Unique integer identifying these measurements in the Experiment.
  // Used as an identifier for reads instead of name in sparse data
  // structures. Number IDs from 1 instead of 0 to avoid the default value.
  int32 measurement_id = 5;

  // Statistics describing the number of counts for each unique sequence from
  // these reads. These are computed automatically as part of pre-processing and
  // used for normalizing outputs in our TensorFlow models.
  CountStatistics statistics = 6;

  // DEPRECATED: Exact number of reads. Use statistics.total_depth instead.
  // Marked with the `deprecated` option so generated code flags remaining
  // uses; the field number and wire format are unchanged.
  int64 depth = 1 [deprecated = true];
}

// Scalar statistics for the counts obtained from sequencing reads.
message CountStatistics {
  // Exact number of reads (the replacement for SequencingReads.depth).
  int64 total_depth = 1;
  // Presumably the number of unique sequences seen in the reads -- TODO
  // confirm.
  int64 num_uniques = 2;
  // Mean of the counts. NOTE(review): presumably taken over the per-unique-
  // sequence counts -- confirm against the pre-processing code.
  float mean = 3;
  // Standard deviation of the counts, over the same population as `mean`.
  float std_dev = 4;
  // Presumably the mean of log(count + 1) -- TODO confirm the log base.
  float mean_log_plus_one = 5;
  // Presumably the standard deviation of log(count + 1) -- TODO confirm.
  float std_dev_log_plus_one = 6;
}

// Metadata for one extra measurement column listed in
// Experiment.additional_output: its column name, its integer id, and the
// statistics used to normalize it.
message AdditionalOutput {
  // Name of the column in the output that stores these measurements.
  string name = 1;

  // Unique integer identifying these measurements in the Experiment.
  // Used as an identifier instead of name in sparse data structures. Number IDs
  // from 1 instead of 0 to avoid the default value.
  int32 measurement_id = 2;

  // Statistics describing the additional output. These are computed
  // automatically as part of pre-processing and used for normalizing outputs in
  // our TensorFlow models.
  AdditionalOutputStatistics statistics = 3;
}

// Scalar statistics for the additional output
message AdditionalOutputStatistics {
  // Mean of the additional output's values.
  float mean = 1;
  // Standard deviation of the additional output's values.
  float std_dev = 2;
  // Presumably the mean of log(value + 1) -- TODO confirm the log base.
  float mean_log_plus_one = 3;
  // Presumably the standard deviation of log(value + 1) -- TODO confirm.
  float std_dev_log_plus_one = 4;
}
